In [1]:
import pandas as pd
import numpy as np
import os

# Location of the Titanic CSV. The default is the path this notebook was
# originally written against; set the TITANIC_CSV environment variable to
# load the file from somewhere else without editing this cell.
data_path = os.environ.get('TITANIC_CSV', '/Users/yangrenqin/Downloads/titanic-data.csv')
data = pd.read_csv(data_path)
# Preview the first seven rows.
data.head(7)
Out[1]:
In [2]:
# Keep only the rows of passengers who survived (Survived flag equal to 1).
is_survivor = data['Survived'] == 1
survived = data[is_survivor]
survived.head(7)
Out[2]:
In [3]:
# Group by port of embarkation and count the non-null Survived entries,
# once among the survivors only and once among all passengers.
embarked_survived = survived.groupby('Embarked')['Survived'].count()
embarked_totalcounts = data.groupby('Embarked')['Survived'].count()
# Overall survival ratio across the whole data set.
survived_total = len(survived) / len(data)
In [4]:
# Number of rows whose Embarked value is missing.
data['Embarked'].isnull().sum()
Out[4]:
In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
# compare the surviving rate of different embark poistion, including the whole population.
embarked_survivedrate=np.array(embarked_survived)/np.array(embarked_totalcounts)
all_survivedrate=np.append(embarked_survivedrate,survived_total)
labels=['C','Q','S','Total']
plt.bar(np.arange(4),all_survivedrate,align='center',tick_label=labels)
plt.xlabel('Embark position')
plt.ylabel('Survived rate (ratio)')
plt.title('The difference of survive rate as function of embark position')
for a,b in zip(np.arange(4),all_survivedrate):
c=str(b*100)[:5]+"%"
plt.text(a,b+0.01,c,horizontalalignment='center')
In [6]:
from scipy.stats import chi2_contingency
# Contingency table: survival outcome (rows) by embarkation port (columns).
# crosstab counts each combination directly and reports 0 for an empty cell,
# which is the input chi2_contingency expects.
pivot_embarked = pd.crosstab(data['Survived'], data['Embarked'])
chi2, p_value, dof, expected = chi2_contingency(pivot_embarked)
print("Results of Chi-Squared test on Embarked to Survival.")
print("Does Embark position have a significant effect on Survival?")
print("Chi-Squared Score = %f" % chi2)
print("Pvalue = %s" % str(p_value))
In [7]:
# Count the non-null Survived entries per passenger class,
# among the survivors only and among all passengers.
pclass_survived = survived.groupby('Pclass')['Survived'].count()
pclass_totalcounts = data.groupby('Pclass')['Survived'].count()
In [8]:
# Number of rows whose Pclass value is missing.
data['Pclass'].isnull().sum()
Out[8]:
In [9]:
# Wedge labels and fill colours for the pie chart drawn in this cell.
labels = [
    'First class',
    'Second class',
    'Third class',
]
colors = [
    'Khaki',
    'LawnGreen',
    'PowderBlue',
]
def make_autopct(values):
    """Build a pie-wedge label formatter showing percentage and absolute count.

    Parameters
    ----------
    values : iterable of numbers
        The quantities the pie chart is drawn from; their sum is the whole.

    Returns
    -------
    callable
        A function taking a wedge's percentage ``pct`` and returning a string
        such as ``'25.00% (1)'``: the percentage with two decimals followed by
        the count recovered from it, rounded to the nearest integer.
    """
    # The total does not change between wedges, so compute it once here
    # instead of on every formatter call.
    total = sum(values)

    def my_autopct(pct):
        val = int(round(pct * total / 100.0))
        return '{p:.2f}% ({v:d})'.format(p=pct, v=val)

    return my_autopct
# Pie chart of how all passengers are split across the classes; every wedge
# is annotated through the formatter built from the same counts.
total_autopct = make_autopct(pclass_totalcounts)
plt.title('Original Class Proportions')
plt.pie(
    pclass_totalcounts,
    labels=labels,
    colors=colors,
    autopct=total_autopct,
);
In [10]:
# Wedge labels and fill colours for the pie chart drawn in this cell.
labels = [
    'First class',
    'Second class',
    'Third class',
]
colors = [
    'BlanchedAlmond',
    'Aqua',
    'DodgerBlue',
]
def make_autopct(values):
    """Build a pie-wedge label formatter showing percentage and absolute count.

    Parameters
    ----------
    values : iterable of numbers
        The quantities the pie chart is drawn from; their sum is the whole.

    Returns
    -------
    callable
        A function taking a wedge's percentage ``pct`` and returning a string
        such as ``'25.00% (1)'``: the percentage with two decimals followed by
        the count recovered from it, rounded to the nearest integer.
    """
    # The total does not change between wedges, so compute it once here
    # instead of on every formatter call.
    total = sum(values)

    def my_autopct(pct):
        val = int(round(pct * total / 100.0))
        return '{p:.2f}% ({v:d})'.format(p=pct, v=val)

    return my_autopct
# Pie chart of how the survivors are split across the classes; every wedge
# is annotated through the formatter built from the same counts.
survived_autopct = make_autopct(pclass_survived)
plt.title('Survived Class Proportions')
plt.pie(
    pclass_survived,
    labels=labels,
    colors=colors,
    autopct=survived_autopct,
);
In [11]:
# Survival rate per passenger class, followed by the overall rate.
# The division is done on the Series so that counts are matched by class label;
# a class with no survivors is filled with 0 rather than shifting the arrays.
pclass_survivedrate = np.array(
    pclass_survived.reindex(pclass_totalcounts.index, fill_value=0) / pclass_totalcounts
)
allpclass_survivedrate = np.append(pclass_survivedrate, survived_total)
# Tick labels follow the classes actually present, plus the overall bar.
labels = [str(pclass) for pclass in pclass_totalcounts.index] + ['Total']
positions = np.arange(len(allpclass_survivedrate))
plt.bar(positions, allpclass_survivedrate, align='center', tick_label=labels)
plt.xlabel('Passenger Class')
plt.ylabel('Survived rate (ratio)')
plt.title('The difference of survive rate as function of passengers socio-economic class')
# Annotate every bar with its rate as a percentage, rounded to two decimals
# (slicing the string representation would truncate instead of round).
for a, b in zip(positions, allpclass_survivedrate):
    c = '{:.2f}%'.format(b * 100)
    plt.text(a, b + 0.01, c, horizontalalignment='center')
In [12]:
from scipy.stats import chi2_contingency
# Contingency table: survival outcome (rows) by passenger class (columns).
# crosstab counts each combination directly and reports 0 for an empty cell,
# which is the input chi2_contingency expects.
pivot_pclass = pd.crosstab(data['Survived'], data['Pclass'])
chi2, p_value, dof, expected = chi2_contingency(pivot_pclass)
print("Results of Chi-Squared test on Pclass to Survival.")
print("Does Pclass have a significant effect on Survival?")
print("Chi-Squared Score = %f" % chi2)
print("Pvalue = %s" % str(p_value))
In [13]:
# Count the non-null Survived entries per sex,
# among the survivors only and among all passengers.
sex_survived = survived.groupby('Sex')['Survived'].count()
sex_totalcounts = data.groupby('Sex')['Survived'].count()
# Element-wise ratio of the two count vectors, as a plain array.
survivors_by_sex = np.array(sex_survived)
passengers_by_sex = np.array(sex_totalcounts)
sex_rate = survivors_by_sex / passengers_by_sex
In [14]:
# Number of rows whose Sex value is missing. Checked on the full data set
# (not just the survivors), since the per-sex totals are computed from it.
data['Sex'].isnull().sum()
Out[14]:
In [15]:
# Side-by-side bars per sex: all passengers (green) next to survivors (orange).
# align='edge' pins each bar's left edge to the given x, so a bar spans
# [x, x + bar_width] and the count labels placed at x + bar_width / 2 sit over
# the bar centres instead of depending on matplotlib's default alignment.
bar_width = 0.8
plt.bar([1.2, 3.2], sex_totalcounts, width=bar_width, align='edge',
        color='green', label='Original total counts')
plt.bar([2, 4], sex_survived, width=bar_width, align='edge',
        color='orange', label='Survived counts')
plt.xlim(0, 6)
plt.ylim(0, 650)
# Each tick marks the boundary shared by a sex's two bars.
plt.xticks([2, 4], ('female', 'male'))
plt.legend(loc='best')
plt.ylabel('people counts')
plt.title('The number of people comparison before and after accident within different sex')
# Write the count just above each bar.
for a, b in zip([1.6, 3.6], sex_totalcounts):
    c = str(b)
    plt.text(a, b + 5, c, horizontalalignment='center')
for a, b in zip([2.4, 4.4], sex_survived):
    c = str(b)
    plt.text(a, b + 5, c, horizontalalignment='center')
In [16]:
# Horizontal bars of the survival rate per sex.
plt.barh([1, 2], sex_rate, align='center', color=['green', 'orange'])
plt.yticks([1, 2], ['female', 'male'])
plt.xlabel('survive rate(ratio)')
plt.title('The difference of survive rate as function of sex')
# Label each bar, just past its end, with the rate as a percentage rounded to
# two decimals (slicing the string representation would truncate instead).
for a, b in zip([1, 2], sex_rate):
    c = '{:.2f}%'.format(b * 100)
    plt.text(b + 0.05, a, c, horizontalalignment='center')
In [17]:
# Contingency table: survival outcome (rows) by sex (columns).
# crosstab counts each combination directly and reports 0 for an empty cell,
# which is the input chi2_contingency expects.
pivot_sex = pd.crosstab(data['Survived'], data['Sex'])
chi2, p_value, dof, expected = chi2_contingency(pivot_sex)
print("Results of Chi-Squared test on Sex to Survival.")
print("Does Sex have a significant effect on Survival?")
print("Chi-Squared Score = %f" % chi2)
print("Pvalue = %s" % str(p_value))
In [ ]: